{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "c7c1a6b6",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.impute import SimpleImputer"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "117328b4",
   "metadata": {},
   "source": [
    "### 使用fit_transform修改NaN值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "8b1bd0a2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 准备修改的标准\n",
    "imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=12)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "1f28993f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>x</td>\n",
       "      <td>10.00</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>y</td>\n",
       "      <td>NaN</td>\n",
       "      <td>20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>z</td>\n",
       "      <td>30.12</td>\n",
       "      <td>30</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   A      B   C\n",
       "0  x  10.00  10\n",
       "1  y    NaN  20\n",
       "2  z  30.12  30"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = {'A': ['x', 'y', 'z'], 'B': [10.0, np.nan, 30.12], 'C': [10, 20, 30]}\n",
    "df = pd.DataFrame(data=data)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "5aeb4fcb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.series.Series'> <class 'pandas.core.frame.DataFrame'>\n"
     ]
    }
   ],
   "source": [
    "# df[]传列表返回DataFrame(多维), 只传一个列而且不是在列表里的会返回Series(一维)\n",
    "print(type(df['A']), type(df[['A']]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "bb8f169b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>x</td>\n",
       "      <td>10.00</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>y</td>\n",
       "      <td>12.00</td>\n",
       "      <td>20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>z</td>\n",
       "      <td>30.12</td>\n",
       "      <td>30</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   A      B   C\n",
       "0  x  10.00  10\n",
       "1  y  12.00  20\n",
       "2  z  30.12  30"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[['B']] = imputer.fit_transform(df[['B']])  # fit_transform参数要二维\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "45b7edd1",
   "metadata": {},
   "source": [
    "###  筛选某列中没有空值的行"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "7856725c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>x</td>\n",
       "      <td>10.00</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>y</td>\n",
       "      <td>NaN</td>\n",
       "      <td>20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>z</td>\n",
       "      <td>30.12</td>\n",
       "      <td>30</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   A      B   C\n",
       "0  x  10.00  10\n",
       "1  y    NaN  20\n",
       "2  z  30.12  30"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = {'A': ['x', 'y', 'z'], 'B': [10.0, np.nan, 30.12], 'C': [10, 20, 30]}\n",
    "df = pd.DataFrame(data=data)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "a100acfe",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>x</td>\n",
       "      <td>10.00</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>z</td>\n",
       "      <td>30.12</td>\n",
       "      <td>30</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   A      B   C\n",
       "0  x  10.00  10\n",
       "1  z  30.12  30"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = df[df['B'].notnull()].reset_index(drop=True)\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b88808d3",
   "metadata": {},
   "source": [
    "### 对多列的数据用同一种方法修改NaN值\n",
    "-  如: 对字符串型的修改为empty"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "fa9c581a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 3 entries, 0 to 2\n",
      "Data columns (total 4 columns):\n",
      " #   Column  Non-Null Count  Dtype  \n",
      "---  ------  --------------  -----  \n",
      " 0   A       2 non-null      object \n",
      " 1   B       2 non-null      float64\n",
      " 2   C       3 non-null      int64  \n",
      " 3   D       1 non-null      object \n",
      "dtypes: float64(1), int64(1), object(2)\n",
      "memory usage: 228.0+ bytes\n"
     ]
    }
   ],
   "source": [
    "data = {\n",
    "    'A': [np.nan, 'x', 'y'],\n",
    "    'B': [10.0, np.nan, 30.12],\n",
    "    'C': [10, 20, 30],\n",
    "    'D': ['r', np.nan, np.nan]\n",
    "        }\n",
    "df = pd.DataFrame(data)\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "08870ec2",
   "metadata": {},
   "outputs": [],
   "source": [
    "imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='empty')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "54d54ab4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['A', 'D'], dtype='object')"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "str_col = df.select_dtypes('object').columns  # 多个类型的话传列表\n",
    "str_col"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "1ef105ca",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "      <th>D</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>NaN</td>\n",
       "      <td>10.00</td>\n",
       "      <td>10</td>\n",
       "      <td>r</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>x</td>\n",
       "      <td>NaN</td>\n",
       "      <td>20</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>y</td>\n",
       "      <td>30.12</td>\n",
       "      <td>30</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     A      B   C    D\n",
       "0  NaN  10.00  10    r\n",
       "1    x    NaN  20  NaN\n",
       "2    y  30.12  30  NaN"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "70492ef0",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "      <th>D</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>empty</td>\n",
       "      <td>10.00</td>\n",
       "      <td>10</td>\n",
       "      <td>r</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>x</td>\n",
       "      <td>NaN</td>\n",
       "      <td>20</td>\n",
       "      <td>empty</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>y</td>\n",
       "      <td>30.12</td>\n",
       "      <td>30</td>\n",
       "      <td>empty</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       A      B   C      D\n",
       "0  empty  10.00  10      r\n",
       "1      x    NaN  20  empty\n",
       "2      y  30.12  30  empty"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.loc[:, str_col] = imputer.fit_transform(df[str_col])  # str_col是列表, df[]返回DF\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e8dac838",
   "metadata": {},
   "source": [
    "### 根据数据范围划分标签"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "5a663434",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>x</td>\n",
       "      <td>10.00</td>\n",
       "      <td>18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>y</td>\n",
       "      <td>NaN</td>\n",
       "      <td>25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>z</td>\n",
       "      <td>30.12</td>\n",
       "      <td>32</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>d</td>\n",
       "      <td>23.10</td>\n",
       "      <td>21</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>e</td>\n",
       "      <td>12.00</td>\n",
       "      <td>20</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   A      B   C\n",
       "0  x  10.00  18\n",
       "1  y    NaN  25\n",
       "2  z  30.12  32\n",
       "3  d  23.10  21\n",
       "4  e  12.00  20"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = {'A': ['x', 'y', 'z', 'd', 'e'], 'B': [10.0, np.nan, 30.12, 23.1, 12], 'C': [18, 25, 32, 21, 20]}\n",
    "df = pd.DataFrame(data)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "ac0cab0a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "      <th>num_range</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>x</td>\n",
       "      <td>10.00</td>\n",
       "      <td>18</td>\n",
       "      <td>(17.986, 22.667]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>y</td>\n",
       "      <td>NaN</td>\n",
       "      <td>25</td>\n",
       "      <td>(22.667, 27.333]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>z</td>\n",
       "      <td>30.12</td>\n",
       "      <td>32</td>\n",
       "      <td>(27.333, 32.0]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>d</td>\n",
       "      <td>23.10</td>\n",
       "      <td>21</td>\n",
       "      <td>(17.986, 22.667]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>e</td>\n",
       "      <td>12.00</td>\n",
       "      <td>20</td>\n",
       "      <td>(17.986, 22.667]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   A      B   C         num_range\n",
       "0  x  10.00  18  (17.986, 22.667]\n",
       "1  y    NaN  25  (22.667, 27.333]\n",
       "2  z  30.12  32    (27.333, 32.0]\n",
       "3  d  23.10  21  (17.986, 22.667]\n",
       "4  e  12.00  20  (17.986, 22.667]"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['num_range'] = pd.cut(df['C'], bins=3)\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ff012fc4",
   "metadata": {},
   "source": [
    "### 自定义区间"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "e94384d8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>x</td>\n",
       "      <td>10.00</td>\n",
       "      <td>18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>y</td>\n",
       "      <td>NaN</td>\n",
       "      <td>25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>z</td>\n",
       "      <td>30.12</td>\n",
       "      <td>32</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>ad</td>\n",
       "      <td>23.10</td>\n",
       "      <td>21</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>ae</td>\n",
       "      <td>12.00</td>\n",
       "      <td>20</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    A      B   C\n",
       "0   x  10.00  18\n",
       "1   y    NaN  25\n",
       "2   z  30.12  32\n",
       "3  ad  23.10  21\n",
       "4  ae  12.00  20"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = {'A': ['x', 'y', 'z', 'd', 'e'], 'B': [10.0, np.nan, 30.12, 23.1, 12], 'C': [18, 25, 32, 21, 20]}\n",
    "df = pd.DataFrame(data)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "35d48e4f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "      <th>num_range</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>x</td>\n",
       "      <td>10.00</td>\n",
       "      <td>18</td>\n",
       "      <td>(15, 20]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>y</td>\n",
       "      <td>NaN</td>\n",
       "      <td>25</td>\n",
       "      <td>(20, 25]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>z</td>\n",
       "      <td>30.12</td>\n",
       "      <td>32</td>\n",
       "      <td>(30, 35]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>d</td>\n",
       "      <td>23.10</td>\n",
       "      <td>21</td>\n",
       "      <td>(20, 25]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>e</td>\n",
       "      <td>12.00</td>\n",
       "      <td>20</td>\n",
       "      <td>(15, 20]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   A      B   C num_range\n",
       "0  x  10.00  18  (15, 20]\n",
       "1  y    NaN  25  (20, 25]\n",
       "2  z  30.12  32  (30, 35]\n",
       "3  d  23.10  21  (20, 25]\n",
       "4  e  12.00  20  (15, 20]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['num_range'] = pd.cut(df['C'], bins=[15, 20, 25, 30, 35])\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7d43b512",
   "metadata": {},
   "source": [
    "### 标签属性是分类category"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "7384c795",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "      <th>num_range</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>x</td>\n",
       "      <td>10.00</td>\n",
       "      <td>18</td>\n",
       "      <td>(15, 20]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>y</td>\n",
       "      <td>NaN</td>\n",
       "      <td>25</td>\n",
       "      <td>(20, 25]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>z</td>\n",
       "      <td>30.12</td>\n",
       "      <td>32</td>\n",
       "      <td>(30, 35]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>d</td>\n",
       "      <td>23.10</td>\n",
       "      <td>21</td>\n",
       "      <td>(20, 25]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>e</td>\n",
       "      <td>12.00</td>\n",
       "      <td>20</td>\n",
       "      <td>(15, 20]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   A      B   C num_range\n",
       "0  x  10.00  18  (15, 20]\n",
       "1  y    NaN  25  (20, 25]\n",
       "2  z  30.12  32  (30, 35]\n",
       "3  d  23.10  21  (20, 25]\n",
       "4  e  12.00  20  (15, 20]"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = {'A': ['x', 'y', 'z', 'd', 'e'], 'B': [10.0, np.nan, 30.12, 23.1, 12], 'C': [18, 25, 32, 21, 20]}\n",
    "df = pd.DataFrame(data)\n",
    "df['num_range'] = pd.cut(df['C'], bins=[15, 20, 25, 30, 35])\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "de88e499",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 5 entries, 0 to 4\n",
      "Data columns (total 4 columns):\n",
      " #   Column     Non-Null Count  Dtype   \n",
      "---  ------     --------------  -----   \n",
      " 0   A          5 non-null      object  \n",
      " 1   B          4 non-null      float64 \n",
      " 2   C          5 non-null      int64   \n",
      " 3   num_range  5 non-null      category\n",
      "dtypes: category(1), float64(1), int64(1), object(1)\n",
      "memory usage: 493.0+ bytes\n"
     ]
    }
   ],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7f7fd7a1",
   "metadata": {},
   "source": [
    "### 设置标签名"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "0d92931d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "      <th>num_range</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>he</td>\n",
       "      <td>10.00</td>\n",
       "      <td>18</td>\n",
       "      <td>very low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>llo</td>\n",
       "      <td>NaN</td>\n",
       "      <td>25</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>,wo</td>\n",
       "      <td>30.12</td>\n",
       "      <td>32</td>\n",
       "      <td>very good</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>rl</td>\n",
       "      <td>23.10</td>\n",
       "      <td>21</td>\n",
       "      <td>low</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>d!</td>\n",
       "      <td>12.00</td>\n",
       "      <td>20</td>\n",
       "      <td>very low</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     A      B   C  num_range\n",
       "0   he  10.00  18   very low\n",
       "1  llo    NaN  25        low\n",
       "2  ,wo  30.12  32  very good\n",
       "3   rl  23.10  21        low\n",
       "4   d!  12.00  20   very low"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = {'A': ['he', 'llo', ',wo', 'rl', 'd!'], 'B': [10.0, np.nan, 30.12, 23.1, 12], 'C': [18, 25, 32, 21, 20]}\n",
    "df = pd.DataFrame(data)\n",
    "df['num_range'] = pd.cut(df['C'], bins=[15, 20, 25, 30, 35], labels=['very low', 'low', 'good', 'very good'])  # 五个值有四个间隔, 要设置四个标签\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "55063cba",
   "metadata": {},
   "source": [
    "### 根据标签获取虚拟编码"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "68a7e6fd",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = {'A': ['he', 'llo', ',wo', 'rl', 'd!'], 'B': [10.0, np.nan, 30.12, 23.1, 12], 'C': [18, 25, 32, 21, 20]}\n",
    "df = pd.DataFrame(data)\n",
    "df['num_range'] = pd.cut(df['C'], bins=[15, 20, 25, 30, 35], labels=['very low', 'low', 'good', 'very good'])  # 五个值有四个间隔, 要设置四个标签"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "1235542a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 5 entries, 0 to 4\n",
      "Data columns (total 4 columns):\n",
      " #   Column     Non-Null Count  Dtype   \n",
      "---  ------     --------------  -----   \n",
      " 0   A          5 non-null      object  \n",
      " 1   B          4 non-null      float64 \n",
      " 2   C          5 non-null      int64   \n",
      " 3   num_range  5 non-null      category\n",
      "dtypes: category(1), float64(1), int64(1), object(1)\n",
      "memory usage: 461.0+ bytes\n"
     ]
    }
   ],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "f58a7886",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>B</th>\n",
       "      <th>C</th>\n",
       "      <th>A_,wo</th>\n",
       "      <th>A_d!</th>\n",
       "      <th>A_he</th>\n",
       "      <th>A_llo</th>\n",
       "      <th>A_rl</th>\n",
       "      <th>num_range_very low</th>\n",
       "      <th>num_range_low</th>\n",
       "      <th>num_range_good</th>\n",
       "      <th>num_range_very good</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10.00</td>\n",
       "      <td>18</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NaN</td>\n",
       "      <td>25</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>30.12</td>\n",
       "      <td>32</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>23.10</td>\n",
       "      <td>21</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>12.00</td>\n",
       "      <td>20</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       B   C  A_,wo   A_d!   A_he  A_llo   A_rl  num_range_very low  \\\n",
       "0  10.00  18  False  False   True  False  False                True   \n",
       "1    NaN  25  False  False  False   True  False               False   \n",
       "2  30.12  32   True  False  False  False  False               False   \n",
       "3  23.10  21  False  False  False  False   True               False   \n",
       "4  12.00  20  False   True  False  False  False                True   \n",
       "\n",
       "   num_range_low  num_range_good  num_range_very good  \n",
       "0          False           False                False  \n",
       "1           True           False                False  \n",
       "2          False           False                 True  \n",
       "3           True           False                False  \n",
       "4          False           False                False  "
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.get_dummies(df)  # 会把字符串也拆开成为特征\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e032a12c",
   "metadata": {},
   "source": [
    "### 统计元素个数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "a7320357",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = {'lists': [[1, 2, 3], [2], ['44', '3'], []]}\n",
    "df = pd.DataFrame(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "0cb59c4a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>lists</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>[1, 2, 3]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>[2]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>[44, 3]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       lists\n",
       "0  [1, 2, 3]\n",
       "1        [2]\n",
       "2    [44, 3]\n",
       "3         []"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "a4031f79",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 4 entries, 0 to 3\n",
      "Data columns (total 1 columns):\n",
      " #   Column  Non-Null Count  Dtype \n",
      "---  ------  --------------  ----- \n",
      " 0   lists   4 non-null      object\n",
      "dtypes: object(1)\n",
      "memory usage: 164.0+ bytes\n"
     ]
    }
   ],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "60dbcb42",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "list"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(df.iloc[0].iloc[0])  # 虽然info里写的是object, 还是能具体找出数据类型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "f6ee74a0",
   "metadata": {},
   "outputs": [],
   "source": [
    "df['count'] = df['lists'].map(len)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "f5a6fac9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>lists</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>[1, 2, 3]</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>[2]</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>[44, 3]</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>[]</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       lists  count\n",
       "0  [1, 2, 3]      3\n",
       "1        [2]      1\n",
       "2    [44, 3]      2\n",
       "3         []      0"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5eea17c5",
   "metadata": {},
   "source": [
    "### 判断元素是否在当中存在"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "080898d7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>lists</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>[1, 2, 3]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>[2]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>[44, 3]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       lists\n",
       "0  [1, 2, 3]\n",
       "1        [2]\n",
       "2    [44, 3]\n",
       "3         []"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = {'lists': [[1, 2, 3], [2], ['44', '3'], []]}\n",
    "df = pd.DataFrame(data)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "c9f56929",
   "metadata": {},
   "outputs": [],
   "source": [
    "df['two_flag'] = df['lists'].map(lambda x: 1 if 2 in x else 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "ce427710",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>lists</th>\n",
       "      <th>two_flag</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>[1, 2, 3]</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>[2]</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>[44, 3]</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>[]</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       lists  two_flag\n",
       "0  [1, 2, 3]         1\n",
       "1        [2]         1\n",
       "2    [44, 3]         0\n",
       "3         []         0"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f5c2209f",
   "metadata": {},
   "source": [
    "### 从字符串中提取标签"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "063d8e51",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>strs</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>#hhh#yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>#hello</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>#what#is#wrong</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             strs\n",
       "0        #hhh#yes\n",
       "1          #hello\n",
       "2  #what#is#wrong\n",
       "3                "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = {'strs': ['#hhh#yes', '#hello', '#what#is#wrong', '']}\n",
    "df = pd.DataFrame(data)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "1a68aa1f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0           [, hhh, yes]\n",
       "1              [, hello]\n",
       "2    [, what, is, wrong]\n",
       "3                     []\n",
       "Name: strs, dtype: object"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['strs'].str.split('#')  # 返回Series"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "28016949",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td></td>\n",
       "      <td>hhh</td>\n",
       "      <td>yes</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td></td>\n",
       "      <td>hello</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td></td>\n",
       "      <td>what</td>\n",
       "      <td>is</td>\n",
       "      <td>wrong</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td></td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  0      1     2      3\n",
       "0      hhh   yes   None\n",
       "1    hello  None   None\n",
       "2     what    is  wrong\n",
       "3     None  None   None"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = df['strs'].str.split('#', expand=True)  # 以数量最多的为列数\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "e1b0f78d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>hhh</td>\n",
       "      <td>yes</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>hello</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>what</td>\n",
       "      <td>is</td>\n",
       "      <td>wrong</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       1     2      3\n",
       "0    hhh   yes   None\n",
       "1  hello  None   None\n",
       "2   what    is  wrong\n",
       "3   None  None   None"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 最前面的空格也被分隔开了，需要去除\n",
    "df = df.drop(columns=[0])\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "6260f1a7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>tag1</th>\n",
       "      <th>tag2</th>\n",
       "      <th>tag3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>hhh</td>\n",
       "      <td>yes</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>hello</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>what</td>\n",
       "      <td>is</td>\n",
       "      <td>wrong</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    tag1  tag2   tag3\n",
       "0    hhh   yes   None\n",
       "1  hello  None   None\n",
       "2   what    is  wrong\n",
       "3   None  None   None"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.columns = ['tag1', 'tag2', 'tag3']  # 修改列名\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "43b02df4",
   "metadata": {},
   "source": [
    "### 统计每行缺失值的个数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "id": "1231069d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>tag1</th>\n",
       "      <th>tag2</th>\n",
       "      <th>tag3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>hhh</td>\n",
       "      <td>yes</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>hello</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>what</td>\n",
       "      <td>is</td>\n",
       "      <td>wrong</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    tag1  tag2   tag3\n",
       "0    hhh   yes   None\n",
       "1  hello  None   None\n",
       "2   what    is  wrong\n",
       "3   None  None   None"
      ]
     },
     "execution_count": 83,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = {'strs': ['#hhh#yes', '#hello', '#what#is#wrong', '']}\n",
    "df = pd.DataFrame(data)\n",
    "df = df['strs'].str.split('#', expand=True).drop(columns=[0])\n",
    "df.columns = ['tag1', 'tag2', 'tag3']\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "72edfc51",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tag1    1\n",
       "tag2    2\n",
       "tag3    3\n",
       "dtype: int64"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.isnull().sum() # 默认是统计每列的空值个数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "id": "6577b883",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>tag1</th>\n",
       "      <th>tag2</th>\n",
       "      <th>tag3</th>\n",
       "      <th>missing</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>hhh</td>\n",
       "      <td>yes</td>\n",
       "      <td>None</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>hello</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>what</td>\n",
       "      <td>is</td>\n",
       "      <td>wrong</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    tag1  tag2   tag3  missing\n",
       "0    hhh   yes   None        1\n",
       "1  hello  None   None        2\n",
       "2   what    is  wrong        0\n",
       "3   None  None   None        3"
      ]
     },
     "execution_count": 84,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['missing'] = df.isnull().sum(axis=1)\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "224efca6",
   "metadata": {},
   "source": [
    "### 将数字字符串删除间隔并转换为整型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "id": "2f7dffe0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>100,000,000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>12,321,111</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>23,000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>224</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         count\n",
       "0  100,000,000\n",
       "1   12,321,111\n",
       "2       23,000\n",
       "3          224"
      ]
     },
     "execution_count": 86,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = {'count': ['100,000,000', '12,321,111', '23,000', '224']}\n",
    "df = pd.DataFrame(data)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "id": "59618d07",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 4 entries, 0 to 3\n",
      "Data columns (total 1 columns):\n",
      " #   Column  Non-Null Count  Dtype \n",
      "---  ------  --------------  ----- \n",
      " 0   count   4 non-null      object\n",
      "dtypes: object(1)\n",
      "memory usage: 164.0+ bytes\n"
     ]
    }
   ],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "id": "74e41726",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>100000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>12321111</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>23000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>224</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       count\n",
       "0  100000000\n",
       "1   12321111\n",
       "2      23000\n",
       "3        224"
      ]
     },
     "execution_count": 88,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['count'] = df['count'].str.replace(',', '').astype(int)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "id": "1782d092",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 4 entries, 0 to 3\n",
      "Data columns (total 1 columns):\n",
      " #   Column  Non-Null Count  Dtype\n",
      "---  ------  --------------  -----\n",
      " 0   count   4 non-null      int32\n",
      "dtypes: int32(1)\n",
      "memory usage: 148.0 bytes\n"
     ]
    }
   ],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "59fd692f",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
